/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is Indexer.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (Original author)
*/
package org.terrier.indexing;
import gnu.trove.TObjectIntHashMap;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import org.apache.log4j.Logger;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.DirectInvertedOutputStream;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.indexing.CompressingMetaIndexBuilder;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.InvertedIndexBuilder;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.indexing.MetaIndexBuilder;
import org.terrier.structures.merging.BlockStructureMerger;
import org.terrier.structures.merging.StructureMerger;
import org.terrier.structures.postings.BasicIterablePosting;
import org.terrier.structures.postings.FieldIterablePosting;
import org.terrier.terms.SkipTermPipeline;
import org.terrier.terms.TermPipeline;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
/**
 * Abstract base class for indexers. It holds the configuration, the term pipeline
 * and the structure builders shared by concrete indexers, and drives the overall
 * process of building one index per batch of documents and merging the batches.
 * <p>
 * <B>Properties:</b>
 * <ul>
 * <li><tt>termpipelines</tt> - the sequence of TermPipeline stages (e.g. <a href="../terms/Stopwords.html">Stopwords</a> removal and <a href="../terms/PorterStemmer.html">PorterStemmer</a>).
 * Defaults to <tt>Stopwords,PorterStemmer</tt>.</li>
 * <li><tt>termpipelines.skip</tt> - a list of tokens which should skip the term pipeline. If not set or empty, then no tokens will skip it.</li>
 * <li><tt>indexing.max.tokens</tt> - The maximum number of tokens the indexer will attempt to index in a document.
 * If 0, then all tokens will be indexed (default).</li>
 * <li><tt>ignore.empty.documents</tt> - If set, empty documents are ignored instead of being assigned document ids
 * (by default they are assigned document ids).</li>
 * <li><tt>indexing.max.docs.per.builder</tt> - Maximum number of documents in an index before a new index is created, and merged later.
 * Defaults to 18000000; 0 means no limit.</li>
 * <li><tt>indexing.builder.boundary.docnos</tt> - Docnos of documents that force the index being created to be completed, and a new index to be commenced. An alternative to <tt>indexing.max.docs.per.builder</tt></li>
 * <li><tt>indexer.meta.forward.keys</tt>, <tt>indexer.meta.forward.keylens</tt>, <tt>indexer.meta.reverse.keys</tt> -
 * comma-delimited configuration of the meta index (defaults: <tt>docno</tt>, <tt>20</tt>, <tt>docno</tt>).</li>
 * </ul>
 * @author Craig Macdonald
 */
public abstract class Indexer
{
    /** the logger for this class */
    protected static final Logger logger = Logger.getLogger(Indexer.class);

    /**
     * The number of documents indexed with a set
     * of builders. If a collection consists of
     * more documents, then we need to create
     * new builders and later merge the data
     * structures. The corresponding property is
     * <tt>indexing.max.docs.per.builder</tt> and the
     * default value is <tt>18000000</tt> (18 million documents). If the property
     * is set equal to zero, then there is no limit.
     */
    protected int MAX_DOCS_PER_BUILDER = 0;

    /**
     * The maximum number of tokens in a document.
     * If it is set to zero, then there is no limit
     * in the number of tokens indexed for a document. Set by property <tt>indexing.max.tokens</tt>.
     */
    protected int MAX_TOKENS_IN_DOCUMENT = 0;

    /** The DOCNO of documents to force builder boundaries */
    protected final HashSet<String> BUILDER_BOUNDARY_DOCUMENTS = new HashSet<String>();

    /**
     * Indicates whether field information should be saved in the
     * created data structures.
     */
    protected boolean useFieldInformation;

    /**
     * The default namespace for the term pipeline classes.
     */
    private final static String PIPELINE_NAMESPACE = "org.terrier.terms.";

    /**
     * The first component of the term pipeline.
     */
    protected TermPipeline pipeline_first;

    /**
     * Indicates whether an entry for empty documents is stored in the
     * document index, or empty documents should be ignored.
     */
    protected boolean IndexEmptyDocuments;

    /**
     * The builder that creates the direct index.
     */
    protected DirectInvertedOutputStream directIndexBuilder;

    /**
     * The builder that creates the document index.
     */
    protected DocumentIndexBuilder docIndexBuilder;

    /**
     * The builder that creates the inverted index.
     */
    protected InvertedIndexBuilder invertedIndexBuilder;

    /**
     * The builder that creates the lexicon.
     */
    protected LexiconBuilder lexiconBuilder;

    /** The builder that creates the meta index. */
    protected MetaIndexBuilder metaBuilder;

    /**
     * The common prefix of the data structures filenames.
     */
    protected String fileNameNoExtension;

    /**
     * The path in which the data structures are stored.
     */
    protected String path;

    /** The prefix of the data structures, ie the first part of the filename */
    protected String prefix;

    /** The index being worked on, denoted by path and prefix */
    protected Index currentIndex = null;

    /** Name of the posting iterator class used for the direct index when fields are not recorded. */
    protected String basicDirectIndexPostingIteratorClass = BasicIterablePosting.class.getName();

    /** Name of the posting iterator class used for the direct index when fields are recorded. */
    protected String fieldDirectIndexPostingIteratorClass = FieldIterablePosting.class.getName();

    /** Creates an indexer at the location ApplicationSetup.TERRIER_INDEX_PATH and
     * ApplicationSetup.TERRIER_INDEX_PREFIX
     */
    public Indexer()
    {
        this(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
    }

    /**
     * Creates an instance of the class. The generated data structures
     * will be saved in the given path. The name of the data structures is given
     * by the prefix parameter.
     * @param _path String the path where the generated data structures will be saved.
     * @param _prefix String the filename that the data structures will have.
     */
    public Indexer(String _path, String _prefix) {
        this.fileNameNoExtension = ApplicationSetup.makeAbsolute(_prefix, _path);
        this.prefix = _prefix;
        this.path = _path;
    }

    /** Protected do-nothing constructor for use by child classes */
    protected Indexer(long a, long b, long c) {
    }

    /** This method must be called by anything which directly extends Indexer.
     * See: http://benpryor.com/blog/2008/01/02/dont-call-subclass-methods-from-a-superclass-constructor/
     */
    protected void init()
    {
        FieldScore.init();
        //init fields before constructing pipeline
        this.load_field_ids();
        //construct pipeline using list specified in terrier.properties
        //this object should be the last item in the pipeline
        this.load_indexer_properties();
        this.load_pipeline();
        //load the docnos of any documents that should force builder boundaries
        this.load_builder_boundary_documents();
    }

    /**
     * An abstract method for creating the direct index, the document index
     * and the lexicon for the given collections.
     * @param collections Collection[] An array of collections to index
     */
    public abstract void createDirectIndex(Collection[] collections);

    /**
     * An abstract method for creating the inverted index, given that the
     * the direct index, the document index and the lexicon have
     * already been created.
     */
    public abstract void createInvertedIndex();

    /**
     * An abstract method that returns the last component
     * of the term pipeline.
     * @return TermPipeline the end of the term pipeline.
     */
    protected abstract TermPipeline getEndOfPipeline();

    /** mapping: field name -> field id, returns 0 for no mapping */
    protected TObjectIntHashMap<String> fieldNames = new TObjectIntHashMap<String>(0);

    /** the number of fields */
    protected int numFields = 0;

    /**
     * Creates the meta index builder for the current index, configured from the
     * properties <tt>indexer.meta.forward.keys</tt> (default <tt>docno</tt>),
     * <tt>indexer.meta.forward.keylens</tt> (default <tt>20</tt>) and
     * <tt>indexer.meta.reverse.keys</tt> (default <tt>docno</tt>), all comma-delimited.
     * @return a new CompressingMetaIndexBuilder writing to {@link #currentIndex}
     * @throws NumberFormatException if a key length is not a valid integer
     */
    protected MetaIndexBuilder createMetaIndexBuilder()
    {
        final String[] forwardMetaKeys = ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno").split("\\s*,\\s*");
        final int[] metaKeyLengths = parseInts(ApplicationSetup.getProperty("indexer.meta.forward.keylens", "20").split("\\s*,\\s*"));
        final String[] reverseMetaKeys = ApplicationSetup.getProperty("indexer.meta.reverse.keys", "docno").split("\\s*,\\s*");
        return new CompressingMetaIndexBuilder(currentIndex, forwardMetaKeys, metaKeyLengths, reverseMetaKeys);
    }

    /**
     * Parses each String of the given array into an int.
     * Whitespace surrounding a number is ignored.
     * @param in the Strings to parse
     * @return an array of the same length containing the parsed values
     * @throws NumberFormatException if an element is not a valid integer
     */
    protected static final int[] parseInts(String[] in)
    {
        final int l = in.length;
        final int[] rtr = new int[l];
        for (int i = 0; i < l; i++)
        {
            // split() only strips whitespace around the commas, not at the
            // very start or end of the property value, hence the trim
            rtr[i] = Integer.parseInt(in[i].trim());
        }
        return rtr;
    }

    /**
     * Loads the indexer settings: whether empty documents are indexed, and the
     * properties <tt>indexing.max.tokens</tt> (default 0) and
     * <tt>indexing.max.docs.per.builder</tt> (default 18000000).
     */
    protected void load_indexer_properties()
    {
        IndexEmptyDocuments = !ApplicationSetup.IGNORE_EMPTY_DOCUMENTS;
        MAX_TOKENS_IN_DOCUMENT = Integer.parseInt(ApplicationSetup.getProperty("indexing.max.tokens", "0"));
        MAX_DOCS_PER_BUILDER = Integer.parseInt(ApplicationSetup.getProperty("indexing.max.docs.per.builder", "18000000"));
    }

    /** loads a mapping of field name -> field id */
    protected void load_field_ids()
    {
        FieldScore.init();
        useFieldInformation = FieldScore.USE_FIELD_INFORMATION;
        if (! FieldScore.USE_FIELD_INFORMATION || FieldScore.FIELDS_COUNT == 0)
            return;
        numFields = FieldScore.FIELDS_COUNT;
        //field ids start at 1, as 0 denotes "no mapping" in fieldNames
        int i = 0;
        for (String f : FieldScore.FIELD_NAMES)
        {
            i++;
            //several tag names separated by | share the same field id
            String[] tagNames = ArrayUtils.parseDelimitedString(f, "|");
            for (String tag : tagNames)
                fieldNames.put(tag, i);
        }
    }

    /**
     * Creates the term pipeline, as specified by the
     * property <tt>termpipelines</tt> in the properties
     * file. The default value of the property <tt>termpipelines</tt>
     * is <tt>Stopwords,PorterStemmer</tt>. This means that we first
     * remove stopwords and then apply Porter's stemming algorithm.
     * <p>
     * Class names without a package are resolved in <tt>org.terrier.terms</tt>,
     * and names in the old <tt>uk.ac.gla.terrier</tt> namespace are mapped to
     * <tt>org.terrier</tt>. A stage that cannot be instantiated is logged and
     * left out of the pipeline. If the property <tt>termpipelines.skip</tt> is
     * non-empty, a SkipTermPipeline is placed in front of the pipeline.
     */
    protected void load_pipeline()
    {
        final String oldNamespace = "uk.ac.gla.terrier";
        final String[] pipes = ApplicationSetup.getProperty(
            "termpipelines", "Stopwords,PorterStemmer").trim()
            .split("\\s*,\\s*");
        TermPipeline next = getEndOfPipeline();
        final TermPipeline last = next;
        //the pipeline is built backwards: each stage is given its successor
        for (int i = pipes.length - 1; i >= 0; i--)
        {
            String className = pipes[i];
            if (className.length() == 0)
                continue;
            if (className.indexOf(".") < 0)
                className = PIPELINE_NAMESPACE + className;
            else if (className.startsWith(oldNamespace))
                //literal replacement: the dots must not be treated as regex wildcards
                className = className.replace(oldNamespace, "org.terrier");
            try {
                final Class<?> pipeClass = Class.forName(className, false, this.getClass().getClassLoader());
                next = (TermPipeline) pipeClass.getConstructor(TermPipeline.class).newInstance(next);
            } catch (Exception e) {
                //reflection can fail in many ways; the stage is skipped in all of them
                logger.warn("TermPipeline object " + className + " could not be loaded, it will be skipped", e);
            }
        }
        //add SkipTermPipeline as the first pipeline step to allow for special terms to skip the pipeline processing sequence
        final String skipTerms = ApplicationSetup.getProperty("termpipelines.skip", null);
        if (skipTerms != null && skipTerms.trim().length() > 0)
            pipeline_first = new SkipTermPipeline(next, last);
        else
            pipeline_first = next;
    }

    /** Loads the builder boundary documents from the property <tt>indexing.builder.boundary.docnos</tt>, comma delimited. */
    protected void load_builder_boundary_documents()
    {
        final String[] docnos = ApplicationSetup.getProperty("indexing.builder.boundary.docnos", "").split("\\s*,\\s*");
        for (int i = 0; i < docnos.length; i++)
        {
            docnos[i] = docnos[i].trim();
            if (docnos[i].length() > 0)
                BUILDER_BOUNDARY_DOCUMENTS.add(docnos[i]);
        }
    }

    /**
     * Creates the data structures for a set of collections.
     * While the last collection has not reached its end, a numbered set of
     * data structures (<tt>prefix_1</tt>, <tt>prefix_2</tt>, ...) is built by
     * calling {@link #createDirectIndex(Collection[])} and
     * {@link #createInvertedIndex()}. If more than one set was built they are
     * merged, otherwise the single set is renamed to the original prefix.
     * The prefix of this indexer is restored before the method returns,
     * even if building an index fails.
     * @param collections The document collection objects to index. If null
     *        or empty, an error is logged and nothing is indexed.
     */
    public void index(Collection[] collections) {
        if (collections == null || collections.length == 0)
        {
            logger.error("No collections were specified, nothing to index");
            return;
        }
        //the last collection decides when indexing is complete
        final int lastCollection = collections.length - 1;
        final String oldIndexPrefix = prefix;
        int counter = 0;
        try {
            while (! collections[lastCollection].endOfCollection()) {
                counter++;
                prefix = oldIndexPrefix + "_" + counter;
                fileNameNoExtension = path + ApplicationSetup.FILE_SEPARATOR + prefix;
                this.createDirectIndex(collections);
                this.createInvertedIndex();
            }
            if (counter > 1) {
                //merge the data structures
                merge(path, oldIndexPrefix, 1, counter);
            }
            else
            {
                try {
                    IndexUtil.renameIndex(path, prefix, path, oldIndexPrefix);
                } catch (IOException ioe) {
                    logger.error("Could not rename index", ioe);
                }
            }
        } finally {
            //restore the prefix
            prefix = oldIndexPrefix;
            fileNameNoExtension = path + ApplicationSetup.FILE_SEPARATOR + prefix;
        }
    }

    /** Merge a series of numbered indices in the same path/prefix area. Intermediate
     * merged indices are numbered from highest+1 upwards, and the final merged index
     * is renamed to mpath/mprefix.
     * @param mpath Path of all indices
     * @param mprefix Common prefix of all indices
     * @param lowest lowest suffix of prefix
     * @param highest highest suffix of prefix
     */
    public static void merge(String mpath, String mprefix, int lowest, int highest)
    {
        LinkedList<String[]> llist = new LinkedList<String[]>();
        for (int i = lowest; i <= highest; i++) {
            llist.add(new String[]{mpath, mprefix + "_" + i});
        }
        //merged data structures are numbered after the highest input,
        //to ensure that they will have different names
        merge(mpath, mprefix, llist, highest + 1);
    }

    /** Merge two indices. The source indices are deleted afterwards.
     * Failures to close or delete an index are logged, not propagated.
     * @param index1 Path/Prefix of source index 1
     * @param index2 Path/Prefix of source index 2
     * @param outputIndex Path/Prefix of destination index
     */
    protected static void mergeTwoIndices(String[] index1, String[] index2, String[] outputIndex){
        final Index src1 = Index.createIndex(index1[0], index1[1]);
        final Index src2 = Index.createIndex(index2[0], index2[1]);
        final Index dst = Index.createNewIndex(outputIndex[0], outputIndex[1]);
        final StructureMerger sMerger = ApplicationSetup.BLOCK_INDEXING
            ? new BlockStructureMerger(src1, src2, dst)
            : new StructureMerger(src1, src2, dst);
        sMerger.mergeStructures();
        //close each index on its own, so that one failure does not leave the others open
        closeAfterMerge(src1, "source index " + index1[1]);
        closeAfterMerge(src2, "source index " + index2[1]);
        closeAfterMerge(dst, "destination index " + outputIndex[1]);
        //delete old indices
        try {
            IndexUtil.deleteIndex(index1[0], index1[1]);
            IndexUtil.deleteIndex(index2[0], index2[1]);
        } catch (IOException ioe) {
            logger.warn("Could not delete merge input indices ", ioe);
        }
    }

    /** Closes an index used in a merge, logging any IOException instead of throwing it. */
    private static void closeAfterMerge(Index index, String description)
    {
        try {
            index.close();
        } catch (IOException ioe) {
            logger.error("Problem closing " + description, ioe);
        }
    }

    /** Merge a series of indices, in pair-wise fashion. The list passed in is
     * not modified. The last merged index is renamed to mpath/mprefix.
     * @param mpath Common path of all indices
     * @param mprefix Prefix of target index
     * @param llist Path/Prefix pairs of the indices to merge
     * @param counterMerged the number used as suffix of the first merged index;
     *        it is incremented for each further merged index
     */
    public static void merge(String mpath, String mprefix, LinkedList<String[]> llist, int counterMerged)
    {
        while (llist.size() > 1) {
            LinkedList<String[]> tmpList = new LinkedList<String[]>();
            // merge every two indices stored in the linked list
            String[] unpaired = null;
            for (String[] current : llist) {
                if (unpaired == null) {
                    unpaired = current;
                    continue;
                }
                String[] outputFilename = new String[]{mpath, mprefix + "_" + (counterMerged++)};
                mergeTwoIndices(unpaired, current, outputFilename);
                tmpList.add(outputFilename);
                unpaired = null;
            }
            // if an index is left over (which means the size of the linked list
            // is odd), merge it with the previous merged index.
            if (unpaired != null) {
                String[] previousMerged = tmpList.removeLast();
                String[] outputFilename = new String[]{mpath, mprefix + "_" + (counterMerged++)};
                mergeTwoIndices(unpaired, previousMerged, outputFilename);
                tmpList.add(outputFilename);
            }
            llist = tmpList;
        }
        //rename the generated structures
        try {
            IndexUtil.renameIndex(mpath, mprefix + "_" + (counterMerged - 1), mpath, mprefix);
        } catch (IOException ioe) {
            logger.error("Could not rename merged index", ioe);
        }
    }

    /** event method to be overridden by child classes */
    protected void finishedDirectIndexBuild() {}

    /** event method to be overridden by child classes */
    protected void finishedInvertedIndexBuild() {}

    /** Returns whether the index will record fields.
     * @return true if field information is saved in the created data structures
     */
    public boolean useFieldInformation() {
        return useFieldInformation;
    }

    /** The document index entry that is written for each empty document. */
    protected DocumentIndexEntry emptyDocIndexEntry = new BasicDocumentIndexEntry();

    /** Adds an entry to the document index and the meta index for an empty document,
     * only if IndexEmptyDocuments is set to true.
     * @param docProperties the properties of the empty document, written to the meta index
     * @throws IOException if writing the entry fails
     */
    protected void indexEmpty(Map<String,String> docProperties) throws IOException
    {
        if (! IndexEmptyDocuments)
            return;
        /* add doc to documentindex, even though it's empty */
        docIndexBuilder.addEntryToBuffer(emptyDocIndexEntry);
        metaBuilder.writeDocumentEntry(docProperties);
    }

    /** Utility method for merging indices. Expects the arguments
     * <tt>--merge [lowid] [highid]</tt>; otherwise a usage message is logged.
     * @param args command line arguments
     * @throws Exception if the ids cannot be parsed or merging fails
     */
    public static void main(String args[]) throws Exception
    {
        //check the length first, so that no arguments at all gives the usage message
        if (args.length == 3 && "--merge".equals(args[0]))
        {
            merge(
                ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX,
                Integer.parseInt(args[1]), Integer.parseInt(args[2])
            );
            return;
        }
        logger.error("Usage: org.terrier.indexing.Indexer --merge [lowid] [highid]");
    }
}